import random
import numpy as np
import os
import pandas as pd

# Build a shuffled multi-label CSV from a COVIDx9A split listing.
#
# Reads `<root>/<split>_COVIDx9A.txt` (space-separated, no header row),
# derives two binary label columns from the `diagnosis` text, shuffles the
# rows and writes `<root>/<split>_dataset.csv` with the columns
# `covid_19, pneumonia, filename`.
root = "/Dataset/COVIDx-splitted-resized-112"
split = 'train'
data = pd.read_csv(f'{root}/{split}_COVIDx9A.txt', sep=" ", header=None)
# Assigning the names after loading (rather than via `names=`) fails fast
# if the listing does not have exactly four columns.
data.columns = ["id", "filename", "diagnosis", "source"]

# Take an explicit copy: adding columns to a bare column-selection of `data`
# triggers pandas' SettingWithCopyWarning on versions without Copy-on-Write.
df = data[['filename', 'diagnosis']].copy()

# Binary label columns (0/1) derived from the diagnosis string.
# COVID-19 rows are flagged in BOTH columns.
# NOTE(review): presumably COVID-19 is treated as a kind of pneumonia for
# multi-label training -- confirm this is intended.
is_covid = df['diagnosis'] == 'COVID-19'
is_pneumonia = df['diagnosis'] == 'pneumonia'
df['covid_19'] = is_covid.astype('int64')
df['pneumonia'] = (is_pneumonia | is_covid).astype('int64')

train_data = df[['covid_19', 'pneumonia', 'filename']]

# Shuffle all rows. No random_state is given, so the row order differs
# between runs.
train_data = train_data.sample(frac=1)

# Name the output after `split` so that processing another split does not
# overwrite the train CSV (for split == 'train' the path is unchanged).
file_name = f"{root}/{split}_dataset.csv"
train_data.to_csv(file_name, encoding='utf-8', index=False)
print('saved at :', file_name)